*1_desc_stats_june2022.do* 
*created: 11-3-21
*updated: 6-24-22
*authors: Meredith Welch 
*This code produces descriptives including means of analysis variables by sample and major group. This code also produces counts of obs by cip2 in each aggregate major group. 


/** SECTION 0: SET UP **/ 

global  outdir "/srv/tier1/projects/180_major/Majors"
global root "/srv/tier1/projects/180_major/Majors"
global data $root/1_data_cleaning/data 
global output_desc $root/2_data_analysis/desc_stats
global output_results $root/2_data_analysis/results
global tables $output_results/tables
global plots $output_results/plots 

cap log close
log using $output_results/1_desc_stats_june2022.log, replace

clear all 
set more off
set emptycells drop 
set matsize 11000
set seed 16554
set sortseed 132156
/** SECTION 0: SET UP CONTROL MACROS **/ 

*COMBINED SECTOR MAJORS MACRO
local major "ag com it voc eng bio sci soc bus und"
*DIFFERENT SECTORS MAJORS MACRO (including liberal arts)
local  majors "ag_4yr com_4yr it_4yr voc_4yr eng_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr lib_4yr ag_2yr com_2yr it_2yr voc_2yr eng_2yr bio_2yr sci_2yr soc_2yr bus_2yr educ_2yr und_2yr lib_2yr"
local  majors_2yr "ag_2yr com_2yr it_2yr voc_2yr eng_2yr bio_2yr sci_2yr soc_2yr bus_2yr educ_2yr und_2yr lib_2yr"
local  majors_4yr "ag_4yr com_4yr it_4yr voc_4yr eng_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr lib_4yr"


/** SECTION 1. MEANS OF ANALYSIS VARIABLES **/ 

// 4-YEAR SECTOR 

*Load data collapsed to one individual (4-year)
use $data/collapsed_data_4yr.dta, clear

*1a. Create table of demographics, test scores, and quarterly earnings for overall 4-year sample 
outreg2 using "$output_desc/tables/desc_4yr_overall.xls", replace keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label  

outreg2 using "$output_desc/tables/desc_4yr_overall.txt", replace keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label  


*1b. Create table of demographics, test scores, and quarterly earnings by major in 4-year sample 
preserve 
keep if ag_4yr==1
outreg2 using "$output_desc/tables/desc_4yr_major.txt", replace keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label cttop("ag_4yr")

outreg2 using "$output_desc/tables/desc_4yr_major.xls", replace keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label cttop("ag_4yr")
restore 

	*Write a loop over majors to append 
	foreach m in lib_4yr com_4yr it_4yr voc_4yr eng_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr {
	preserve
	keep if `m'==1 
	outreg2 using "$output_desc/tables/desc_4yr_major.txt", append keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label cttop("`m'")
	
	outreg2 using "$output_desc/tables/desc_4yr_major.xls", append keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label cttop("`m'")
	restore 
	}

 
*2. Create table with proportion of sample in each major 

** Create indicator for double major 
egen major_sum = rowtotal(lib_4yr ag_4yr com_4yr it_4yr voc_4yr eng_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr)
gen double_4yr = major_sum > 1

label variable double_4yr "Double Major 4yr Sector"

outreg2 using "$output_desc/tables/prop_4yr_major.txt", replace keep(lib_4yr ag_4yr com_4yr it_4yr voc_4yr eng_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr double_4yr) eqkeep(N mean) dec(3) label sum(detail)

outreg2 using "$output_desc/tables/prop_4yr_major.xls", replace keep(lib_4yr ag_4yr com_4yr it_4yr voc_4yr eng_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr double_4yr) eqkeep(N mean) dec(3) label sum(detail)




// 2-YEAR SECTOR 
*Load data collapsed to one individual (2-year)
use $data/collapsed_data_2yr.dta, clear

*1a. Create table of demographics, test scores, and quarterly earnings for overall 4-year sample 
outreg2 using "$output_desc/tables/desc_2yr_overall.xls", replace keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label  

outreg2 using "$output_desc/tables/desc_2yr_overall.txt", replace keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label  


*1b. Create table of demographics, test scores, and quarterly earnings by major in 4-year sample 
*majors: lib_2yr ag_2yr com_2yr it_2yr voc_2yr eng_2yr bio_2yr sci_2yr soc_2yr bus_2yr educ_2yr und_2yr
preserve 
keep if ag_2yr==1
outreg2 using "$output_desc/tables/desc_2yr_major.txt", replace keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label cttop("ag_4yr")

outreg2 using "$output_desc/tables/desc_2yr_major.xls", replace keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label cttop("ag_4yr")
restore 

	*Write a loop over majors to append 
	foreach m in lib_2yr com_2yr it_2yr voc_2yr eng_2yr bio_2yr sci_2yr soc_2yr bus_2yr educ_2yr und_2yr {
	preserve
	keep if `m'==1 
	outreg2 using "$output_desc/tables/desc_2yr_major.txt", append keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label cttop("`m'")
	
	outreg2 using "$output_desc/tables/desc_2yr_major.xls", append keep(z_math z_reading rank_90_math rank_7090_math rank_90_reading rank_7090_reading male white hispanic black asian gift atrisk econ_disad wage_5to10_zero wage_10to15_zero wage_15to20_zero) sum(detail) eqkeep(N mean sd) dec(3) label cttop("`m'")
	restore 
	}

 
*2. Create table with proportion of sample in each major 

** Create indicator for double major 
egen major_sum = rowtotal(lib_2yr ag_2yr com_2yr it_2yr voc_2yr eng_2yr bio_2yr sci_2yr soc_2yr bus_2yr und_2yr educ_2yr)
gen double_2yr = major_sum > 1

label variable double_2yr "Double Major 2yr Sector"

outreg2 using "$output_desc/tables/prop_2yr_major.txt", replace keep(lib_2yr ag_2yr com_2yr it_2yr voc_2yr eng_2yr bio_2yr sci_2yr soc_2yr bus_2yr educ_2yr und_2yr double_2yr) eqkeep(N mean) dec(3) label sum(detail)

outreg2 using "$output_desc/tables/prop_2yr_major.xls", replace keep(lib_2yr ag_2yr com_2yr it_2yr voc_2yr eng_2yr bio_2yr sci_2yr soc_2yr bus_2yr educ_2yr und_2yr double_2yr) eqkeep(N mean) dec(3) label sum(detail)

 



/** SECTION 2. COUNTS BY AGGREGATE AND CIP2 MAJOR GROUPS **/ 


// 4-YEAR SAMPLE 

use $data/collapsed_data_4yr.dta, clear
gen dummy = 1 

* Loop through each aggregate major group 
foreach maj in `majors_4yr' {

* First majors 
preserve 
collapse (sum) dummy if `maj'==1 & which_maj_`maj'==1, by(maj_4_4yr)
gen major = "`maj'" 
gen which_major = 1
save $tables/counts_major_group_`maj'.dta, replace 
restore 

preserve
collapse (sum) dummy if `maj'==1 & which_maj_`maj'==2, by(sec_maj_4_4yr)
gen major = "`maj'" 
gen which_major = 2
rename sec_maj_4_4yr maj_4_4yr
append using $tables/counts_major_group_`maj'.dta
save $tables/counts_major_group_`maj'.dta, replace 
restore 
} 
 

* Append all majors together and save 
use $tables/counts_major_group_ag_4yr.dta, clear 

foreach maj in lib_4yr com_4yr it_4yr voc_4yr eng_4yr bio_4yr sci_4yr soc_4yr bus_4yr und_4yr {
	append using $tables/counts_major_group_`maj'.dta 
}

* Create variable for 2 digit CIP code. 
gen cip2 = substr(maj_4_4yr, 1, 2)

* Not all aggregate groups are determined by 2-digit (Social Sciences = 45). Replace for those. 
gen cip_use = cip2 
replace cip_use = maj_4_4yr if cip2=="45" 

* Collapse by CIP4 (across first and secon majors) 
collapse (sum) dummy, by(major cip_use maj_4_4yr) 
rename dummy count 
rename cip_use cip2_use 
rename maj_4_4yr cip4_use 
rename major maj_group 
save $tables/counts_major_cip4_4yr.dta, replace 

* Collapse again by CIP2 and aggregate major group 
collapse (sum) count, by(maj_group cip2_use)
save $tables/counts_major_group_4yr.dta, replace 

* Drop groups less than 30 
drop if count<30 

* Export to excel file 
export excel using $tables/counts_major_group_4yr, firstrow(variables) replace 

* Remove all temp files 
foreach maj in `majors_4yr' {
	rm $tables/counts_major_group_`maj'.dta
}


// 2-YEAR SAMPLE 

use $data/collapsed_data_2yr.dta, clear
gen dummy = 1 

* Loop through each aggregate major group 
foreach maj in `majors_2yr' {

* First majors 
preserve 
collapse (sum) dummy if `maj'==1 & which_maj_`maj'==1, by(maj_4_2yr)
gen major = "`maj'" 
gen which_major = 1
save $tables/counts_major_group_`maj'.dta, replace 
restore 

preserve
collapse (sum) dummy if `maj'==1 & which_maj_`maj'==2, by(sec_maj_4_2yr)
gen major = "`maj'" 
gen which_major = 2
rename sec_maj_4_2yr maj_4_2yr
append using $tables/counts_major_group_`maj'.dta
save $tables/counts_major_group_`maj'.dta, replace 
restore 
} 
 

* Append all majors together and save 
use $tables/counts_major_group_ag_2yr.dta, clear 

foreach maj in com_2yr it_2yr voc_2yr eng_2yr bio_2yr sci_2yr soc_2yr bus_2yr educ_2yr und_2yr lib_2yr {
	append using $tables/counts_major_group_`maj'.dta 
}

* Create variable for 2 digit CIP code. 
gen cip2 = substr(maj_4_2yr, 1, 2)

* Not all aggregate groups are determined by 2-digit (Social Sciences = 45). Replace for those. 
gen cip_use = cip2 
replace cip_use = maj_4_2yr if cip2=="45" 

* Collapse by CIP4 (across first and secon majors) 
collapse (sum) dummy, by(major cip_use maj_4_2yr) 
rename dummy count 
rename cip_use cip2_use 
rename maj_4_2yr cip4_use 
rename major maj_group 
save $tables/counts_major_cip4_2yr.dta, replace 

* Collapse again by CIP2 and aggregate major group 
collapse (sum) count, by(maj_group cip2_use)
save $tables/counts_major_group_2yr.dta, replace 

* Drop groups less than 30 
drop if count<30 

* Export to excel file 
export excel using $tables/counts_major_group_2yr, firstrow(variables) replace 

* Remove all temp files 
foreach maj in `majors_2yr' {
	rm $tables/counts_major_group_`maj'.dta
}

/** END **/ 

log close
